Author: Amalie Skålevåg, skalevag2@uni-potsdam.de
Evaluating results of cluster analysis on automatically detected events, with my own event characteristics.
# modules
import random
from functools import partial
from pathlib import Path

import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg
import scipy.stats
import shap
import sklearn.metrics
from IPython.display import Image
from matplotlib.backends.backend_pdf import PdfPages

import hysevt.clustering.evaluation  # used below (calc_metric_zscores etc.) but was never imported
import hysevt.clustering.metrics_clustering
import hysevt.events.metrics
from hysevt.utils import tools, visualise
# set base directory
# set base directory (notebook lives one level below the project root)
baseDIR = Path("..").resolve()
assert baseDIR.is_dir()
# data directory
dataDIR = "/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/data"
dataDIR = Path(dataDIR)
localDataDIR = baseDIR.joinpath("data")
# station
station_name = "vent"
station_id = 201350  # station id for Vent-Rofental gauging station, as used by HD Tirol
# events
events_version = "automatic_local_minimum_events"
eventsDIR = localDataDIR.joinpath("events_database", station_name, events_version)
file_events = eventsDIR.joinpath(
    f"{station_id}_{station_name}_event_metrics.csv",
)
# results
resDIR = baseDIR.joinpath("results")
assert resDIR.is_dir()
figDIR = baseDIR.joinpath("figures")
assert figDIR.is_dir()
# clustering variant evaluated in this notebook
cluster_type = "metrics_clustering"
algorithm = "gaussian_mixture_model"
input_data = "reduced"  # presumably the PCA-reduced feature space (PC1..PC7 used below) — TODO confirm
clusteringDIR = resDIR.joinpath(
    f"{events_version}_{station_name}", cluster_type, algorithm
)
# annual sediment yield: first (assumed only) file matching the glob pattern
annual_SSY = pd.read_csv(
    list(
        localDataDIR.joinpath("preprocessed").glob(
            f"{station_id}_{station_name}_annual_sediment_yield.csv"
        )
    )[0],
    index_col=0,
)
# files
file_data = localDataDIR.joinpath(
    "preprocessed", f"{station_id}_{station_name}_SSC_Q.csv"
)
# data
# data
# gauging-station time series (SSC and Q) used later to extract event hydrographs
gauge_data = hysevt.events.metrics.get_gauge_data(file_data)
# training data used for the clustering (one row per event)
training_data = pd.read_csv(
    resDIR.joinpath(
        f"{events_version}_{station_name}",
        cluster_type,
        f"training_data_{input_data}.csv",
    ),
    index_col=0,
)
# same training data as a plain numpy array (used by plot_gmm below)
X_train = np.load(
    resDIR.joinpath(
        f"{events_version}_{station_name}", cluster_type, f"X_{input_data}.npy"
    )
)
# names of the original event metrics that went into the training data
training_features = tools.load_list_from_txt(
    resDIR.joinpath(
        f"{events_version}_{station_name}", cluster_type, "training_data_features.txt"
    )
)
# sanity check: the array and the dataframe describe the same events
assert X_train.shape[0] == len(training_data)
event_ids = training_data.index
# Collect model-selection scores (BIC, Calinski-Harabasz, silhouette, number of
# fitted parameters) for every fitted model of every clustering run from
# 2023-06-15.  Runs whose settings.json has no "covariance_type" are skipped.
runs = list(clusteringDIR.glob("run_20230615*"))
fit_results = {}
for runDIR in runs:
    models = list(runDIR.glob(f"*{input_data}*.joblib"))
    try:
        cov_type = tools.load_dict_from_json(runDIR.joinpath("settings.json"))[
            "covariance_type"
        ]
    except KeyError:
        # not a GMM run of interest (no covariance type recorded)
        continue
    print(cov_type)
    output = []
    for model_file in models:
        # load each fitted model ONCE — the original re-read the joblib file
        # up to seven times per model, once per attribute/score
        gmm = joblib.load(model_file)
        try:
            predicted = gmm.predict(training_data)
            output.append(
                (
                    gmm.n_components,
                    gmm.bic(training_data),
                    sklearn.metrics.calinski_harabasz_score(training_data, predicted),
                    sklearn.metrics.silhouette_score(training_data, predicted),
                    # total number of free parameters in the fitted mixture
                    gmm.covariances_.size + gmm.means_.size + gmm.weights_.size,
                )
            )
        except ValueError:
            # cluster-validity scores are undefined (e.g. all samples were
            # assigned to a single cluster); keep BIC, NaN-fill the rest
            output.append(
                (
                    gmm.n_components,
                    gmm.bic(training_data),
                    np.nan,
                    np.nan,
                    np.nan,
                )
            )
    fit_results[f"{runDIR.name}_{cov_type}"] = pd.DataFrame(
        output,
        columns=[
            "k",
            "BIC",
            "calinski_harabasz_score",
            "silhouette_score",
            "parameters",
        ],
    ).sort_values("k")
spherical diag full tied
# compare the three selection scores across covariance types ("tied" excluded)
fig, ax = plt.subplots(nrows=3, figsize=(5, 7), sharex=True)
for i, score in enumerate(["BIC", "calinski_harabasz_score", "silhouette_score"]):
    for run in fit_results:
        if "tied" in run:
            continue
        fit_results[run].plot(
            x="k", y=score, label=run.split("_")[-1], ax=ax[i], legend=False
        )
    if i == 0:
        ax[i].legend(bbox_to_anchor=(1, 1), title="Covariance type")
    ax[i].set_ylabel(score)
    ax[i].set_xticks(fit_results[run]["k"])
# number of fitted parameters vs k, per covariance type
fig, ax = plt.subplots()
for run in fit_results:
    fit_results[run].plot(
        x="k", y="parameters", label=run.split("_")[-1], ax=ax, legend=False
    )
ax.legend(bbox_to_anchor=(1, 1))
ax.set_ylabel("number of fitted parameters")
ax.set_xticks(fit_results[run]["k"])
# reference line: number of training samples (models above it are over-parameterised)
ax.hlines(
    training_data.shape[0],
    min(fit_results[run]["k"]),
    max(fit_results[run]["k"]),
    color="grey",
)
plt.xlim(2, 20)
(2.0, 20.0)
# candidate number of clusters, marked in the plot below
k = 4
fig, ax = plt.subplots(nrows=2, sharex=True)
for i, score in enumerate(["BIC", "calinski_harabasz_score"]):
    sel_runs = []
    for run in fit_results:
        if "tied" in run:
            continue
        else:
            sel_runs.append(run)
            fit_results[run].plot(x="k", y=score, label=run, ax=ax[i], legend=False)
    if i == 0:
        ax[i].legend(bbox_to_anchor=(1, 1), title="Covariance type")
    ax[i].set_ylabel(score)
    ax[i].set_xticks(fit_results[run]["k"])
    # vertical marker at the selected k, spanning the score range of all runs
    ax[i].vlines(
        k,
        min([fit_results[run][score].min() for run in sel_runs]),
        max([fit_results[run][score].max() for run in sel_runs]),
        color="grey",
    )
plt.savefig(
    figDIR.joinpath(f"model_selection_{cluster_type}_{algorithm}_{input_data}.png"),
    dpi=300,
    bbox_inches="tight",
)
# pick the clustering run directory that is evaluated in detail below
if algorithm == "gaussian_mixture_model":
    print(list(clusteringDIR.glob("run_*")))
    run_id = "202306151133"  # clustering run id
    clusterDIR = clusteringDIR.joinpath(f"run_{run_id}")
else:
    clusterDIR = clusteringDIR
assert clusterDIR.is_dir()
clusterDIR
[PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151133'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151138'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202304171429'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151143'), PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151142')]
PosixPath('/home/skalevag/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/results/automatic_local_minimum_events_vent/metrics_clustering/gaussian_mixture_model/run_202306151133')
The number of clusters may need to be evaluated for some clustering algorithms.
# settings used for the selected clustering run (covariance type etc.)
settings = tools.load_dict_from_json(clusterDIR.joinpath("settings.json"))
settings
{'n_init': 100,
'init_params': 'random_from_data',
'verbose': 0,
'covariance_type': 'spherical'}
For the test example the number of clusters were determined:
# number of clusters
k = 4
print(k)
# cluster colors
# one rainbow colour per cluster, reused consistently in all plots below
cluster_colors = plt.cm.rainbow(np.linspace(0, 1, k))
4
# save results from best clustering to subfolder
ODIR = clusterDIR.joinpath(f"best_result_{input_data}_{k}_clusters")
# mkdir(exist_ok=True) replaces the original racy is_dir()/mkdir() pair;
# parents=True makes the call robust should intermediate dirs be missing
ODIR.mkdir(parents=True, exist_ok=True)
# event metrics table for all detected events
event_metrics = pd.read_csv(file_events, index_col=0)
# start/end are stored as strings in the CSV — parse to timestamps
event_metrics.start = pd.to_datetime(event_metrics.start)
event_metrics.end = pd.to_datetime(event_metrics.end)
event_metrics = event_metrics.set_index("event_id")
event_metrics.head()
| start | end | duration | seasonal_timing | year | SSY | pSSY | SSC_max | SSC_mean | SSC_mean_weighted | ... | Q_max_log | Q_mean_log | Q_max_previous_ratio | time_since_last_event | last_event_SSY_elapsed_time_logratio | last_event_Qtotal_elapsed_time_logratio | last_event_Q_max_elapsed_time_logratio | last_event_SSC_max_elapsed_time_logratio | SHI | AHI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| event_id | |||||||||||||||||||||
| VEN-AUT-2006-001 | 2006-05-18 08:45:00 | 2006-05-19 09:45:00 | 25.00 | 138 | 2006 | 1069.088754 | 0.004492 | 3736.9221 | 1079.428631 | 1214.423695 | ... | 2.700690 | 2.270532 | NaN | NaN | NaN | NaN | NaN | NaN | 0.034235 | 0.586656 |
| VEN-AUT-2006-002 | 2006-05-28 10:15:00 | 2006-06-05 09:30:00 | 191.25 | 148 | 2006 | 1806.702578 | 0.007591 | 2508.0991 | 483.575956 | 641.000629 | ... | 2.523326 | 1.408162 | 0.177364 | 216.50 | 1.596971 | 8.310457 | -2.676901 | 2.848427 | -0.128928 | 0.447181 |
| VEN-AUT-2006-003 | 2006-06-13 09:00:00 | 2006-06-14 08:45:00 | 23.75 | 164 | 2006 | 882.157443 | 0.003706 | 1874.1661 | 931.966941 | 1011.271536 | ... | 2.635480 | 2.312174 | -0.112154 | 191.50 | 2.244371 | 9.596851 | -2.731562 | 2.572393 | 0.037343 | 0.242092 |
| VEN-AUT-2006-004 | 2006-06-14 09:00:00 | 2006-06-15 09:30:00 | 24.50 | 165 | 2006 | 1041.399567 | 0.004375 | 1799.2461 | 996.552409 | 1043.353769 | ... | 2.687847 | 2.416121 | -0.052368 | 0.25 | 8.168665 | 15.065212 | 4.021774 | 8.922213 | 0.061433 | 0.032412 |
| VEN-AUT-2006-005 | 2006-06-15 09:45:00 | 2006-06-16 09:00:00 | 23.25 | 166 | 2006 | 1144.288279 | 0.004808 | 1905.8293 | 1073.529931 | 1111.614591 | ... | 2.758109 | 2.498790 | -0.070262 | 0.25 | 8.334615 | 15.199930 | 4.074142 | 8.881417 | 0.005221 | -0.011580 |
5 rows × 35 columns
# standardised event time series used for the METS (event-shape) clustering
data = np.load(
    eventsDIR.joinpath(
        f"{station_id}_{station_name}_preprocessed_events_for_METS_clustering.npy"
    )
)
data_event_id = tools.load_list_from_txt(
    eventsDIR.joinpath(f"{station_id}_{station_name}_event_id_list.txt")
)
# keep only events that are also part of the clustering training data
mask = np.array([ID in event_ids for ID in data_event_id])
data = data[mask]
# fitted clustering model for the chosen k
model = joblib.load(
    clusterDIR.joinpath(f"{algorithm}_{k}_clusters_{input_data}_data.joblib")
)
if algorithm == "gaussian_mixture_model":
    # hard labels plus per-event cluster membership probabilities
    y = model.predict(training_data)
    prob = model.predict_proba(training_data)
    cluster_prob = pd.DataFrame(
        prob, index=event_ids, columns=[f"prob_cluster_{c}" for c in range(k)]
    )
else:
    # non-probabilistic algorithms expose labels directly
    y = model.labels_
labels = pd.Series(y, index=event_ids).rename("cluster_id")
event_metrics_labels = event_metrics.join(labels)
help(hysevt.clustering.evaluation.calc_metric_zscores)
help(hysevt.clustering.evaluation.calc_cluster_zscore_averages)
help(hysevt.clustering.evaluation.calc_cluster_zscore_error_range)
Help on function calc_metric_zscores in module hysevt.clustering.evaluation: calc_metric_zscores(metrics, cluster_labels, drop_cols=['start', 'end']) Help on function calc_cluster_zscore_averages in module hysevt.clustering.evaluation: calc_cluster_zscore_averages(metric_zscores, mode='median', cluster_id_col='cluster_id') Help on function calc_cluster_zscore_error_range in module hysevt.clustering.evaluation: calc_cluster_zscore_error_range(metric_zscores, error_range='full', cluster_id_col='cluster_id')
# SSY vs total streamflow volume, coloured by cluster
fig, ax = plt.subplots(figsize=(5, 4))
for c, color in zip(range(k), cluster_colors):
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="Qtotal",
        y="SSY",
        ax=ax,
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        logx=False,
        logy=False,
    )
    # ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")
ax.set_xlabel("Total streamflow volume [m3]")
ax.set_ylabel("Event SSY [t]")
ax.legend()
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_SSY_Qtotal.png"),
    dpi=300,
    bbox_inches="tight",
)
# two-panel version: SSY vs Qtotal (left) and SSY vs day-of-year (right)
fig, ax = plt.subplots(figsize=(9, 4), ncols=2, sharey=True)
for c, color in zip(range(k), cluster_colors):
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="Qtotal",
        y="SSY",
        ax=ax[0],
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        logx=False,
        logy=False,
    )
ax[0].set_xlabel("Total streamflow volume [$m^3$]")
ax[0].set_ylabel("Event SSY [$t$]")
ax[0].legend()
for c, color in zip(range(k), cluster_colors):
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing",
        y="SSY",
        ax=ax[1],
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
        legend=False,
    )
ax[1].set_xlabel("Day of year")
ax[1].set_ylabel("")
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter.png"),
    dpi=300,
    bbox_inches="tight",
)
# SSY vs day-of-year, all clusters in one panel
fig, ax = plt.subplots(figsize=(5, 4))
for c, color in zip(range(k), cluster_colors):
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing",
        y="SSY",
        ax=ax,
        edgecolor=color,
        color="none",
        label=f"Cluster {c}",
    )
    # ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")
ax.set_xlabel("Day of year")
ax.legend()
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_SSY_DOY.png"),
    dpi=300,
    bbox_inches="tight",
)
# one figure per cluster: the cluster highlighted, all other events greyed out
for c, color in zip(range(k), cluster_colors):
    fig, ax = plt.subplots()
    event_metrics_labels[event_metrics_labels.cluster_id != c].plot.scatter(
        x="seasonal_timing", y="SSY", color="grey", alpha=0.2, ax=ax
    )
    event_metrics_labels[event_metrics_labels.cluster_id == c].plot.scatter(
        x="seasonal_timing", y="SSY", ax=ax, color=color
    )
    ax.set_title(f"Cluster {c} (n={(event_metrics_labels.cluster_id == c).sum()})")
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_scatter_cluster{c}.png"),
        dpi=300,
        bbox_inches="tight",
    )
# attach the per-cluster membership probabilities to metrics and training data
event_metrics_cluster_prob = event_metrics.join(cluster_prob)
training_data_cluster_prob = training_data.join(cluster_prob)
def plot_cluster_prob(features_with_cluster_prob, k, x, y, x_label=None, y_label=None):
    """Scatter `x` vs `y` once per cluster, coloured by membership probability.

    Parameters
    ----------
    features_with_cluster_prob : pandas.DataFrame
        Must contain columns `x`, `y` and `prob_cluster_{c}` for c in range(k).
    k : int
        Number of clusters; one panel per cluster, arranged on two rows.
    x, y : str
        Column names to plot against each other.
    x_label, y_label : str, optional
        Shared axis labels; default to the column names.
    """
    # one sequential colormap per cluster panel
    seq_cmaps = [
        "Purples",
        "Blues",
        "Greens",
        "Oranges",
        "Reds",
        "YlOrBr",
        "YlOrRd",
        "OrRd",
        "PuRd",
        "RdPu",
        "BuPu",
        "GnBu",
        "PuBu",
        "YlGnBu",
        "PuBuGn",
        "BuGn",
        "YlGn",
    ]
    if x_label is None:
        x_label = x
    if y_label is None:
        y_label = y
    # ceil(k / 2) columns so the 2-row grid always provides >= k axes
    # (the original used k // 2, which raised IndexError for odd k)
    ncols = (k + 1) // 2
    fig, ax = plt.subplots(
        ncols=ncols, nrows=2, figsize=(2.5 * k, 3 * 2), sharex=True, sharey=True
    )
    ax = np.asarray(ax).ravel()
    for c, cmap in zip(range(k), seq_cmaps[:k]):
        features_with_cluster_prob.plot.scatter(
            ax=ax[c],
            x=x,
            y=y,
            c=f"prob_cluster_{c}",
            cmap=cmap,
            vmin=0,
            vmax=1,
            alpha=0.5,
            edgecolor="none",
        )
        ax[c].set_title(f"Cluster {c}")
        ax[c].set_ylabel("")
    # hide any spare axes (odd k leaves one panel unused)
    for spare in ax[k:]:
        spare.set_visible(False)
    # shared axis labels placed in figure coordinates
    fig.text(0.5, -0.03, x_label, fontsize=18)
    fig.text(-0.03, 0.5, y_label, fontsize=18, rotation="vertical")
    plt.tight_layout()
# membership-probability panels for several variable pairs
x = "Qtotal"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)
x = "SHI"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)
x = "seasonal_timing"
y = "SSY"
plot_cluster_prob(event_metrics_cluster_prob, k, x=x, y=y)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
    dpi=300,
    bbox_inches="tight",
)
# principal-component pairs only exist for the PCA-reduced input data
if input_data == "reduced":
    x = "PC1"
    y = "PC2"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )
    x = "PC1"
    y = "PC3"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )
    x = "PC1"
    y = "PC4"
    plot_cluster_prob(training_data_cluster_prob, k, x=x, y=y)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_cluster_prob_{x}_vs_{y}.png"),
        dpi=300,
        bbox_inches="tight",
    )
# pick one representative event per cluster
cluster_representative = []
for c in range(k):
    # random draw among the high-probability (> 0.9) members of cluster c
    # (NOT the single highest-probability event, as a previous comment claimed;
    # the selection is non-deterministic unless random is seeded)
    event_id = random.choice(
        list(cluster_prob[cluster_prob[f"prob_cluster_{c}"] > 0.9].index)
    )
    cluster_representative.append(event_id)
cluster_representative
['VEN-AUT-2015-037', 'VEN-AUT-2006-075', 'VEN-AUT-2009-030', 'VEN-AUT-2007-085']
# one representative event per cluster: time series (top) + hysteresis (bottom)
fig, ax = plt.subplots(nrows=2, ncols=k, figsize=(k * 3, 4), sharey=False)
for c in range(k):
    event = event_metrics.loc[cluster_representative[c]]
    event_series = hysevt.events.metrics.get_event_data(
        event.start, event.end, gauge_data
    )
    visualise.plotEventSeries(event_series, ax=ax[0][c])
    visualise.plotEventHysteresis(event_series, ax=ax[1][c])
    # ax[0][c].set_title(f"{event.start:%Y-%m-%d},duration: {event.duration} h")
    ax[0][c].set_title(f"Cluster {c}: Example", fontsize=16)
    # keep only the first and last x tick to reduce clutter
    ax[0][c].set_xticks((ax[0][c].get_xticks()[0], ax[0][c].get_xticks()[-1]))
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"example_events_with_hysteresis_{'_'.join(cluster_representative)}_EGUposter.png"
    ),
    dpi=600,
    bbox_inches="tight",
)
# one multi-page PDF per cluster with every member event's time series and
# hysteresis loop; skipped when the PDF already exists (expensive to regenerate).
# NOTE: "hystersis" is a typo but is kept — it is a runtime filename that must
# match previously generated output files.
for c in range(k):
    if ODIR.joinpath(f"cluster{c}_event_hystersis_plots.pdf").is_file():
        continue
    with PdfPages(ODIR.joinpath(f"cluster{c}_event_hystersis_plots.pdf")) as pdf:
        for event_id in labels[labels == c].index.to_list():
            fig, ax = plt.subplots(ncols=2, figsize=(8, 3), sharey=False)
            event = event_metrics.loc[event_id]
            event_series = hysevt.events.metrics.get_event_data(
                event.start, event.end, gauge_data
            )
            visualise.plotEventSeries(event_series, ax=ax[0])
            visualise.plotEventHysteresis(event_series, ax=ax[1])
            ax[0].set_title(
                f"{event_id}, {event.start:%Y-%m-%d %H:%M}, {event.duration:.0f} hours"
            )
            # colour the title by cluster; shows the event's membership probability
            fig.suptitle(
                f"Probability cluster {c}: {event_metrics_cluster_prob[f'prob_cluster_{c}'][event_id]:.2f}",
                color=cluster_colors[c],
            )
            plt.tight_layout()
            pdf.savefig()
            plt.close()
# tabulate the fitted GMM parameters: per-cluster means, variance and weight.
# NOTE(review): joining covariances_ as a single "Variance" column assumes one
# scalar variance per cluster, i.e. covariance_type == "spherical" (which the
# run settings above show) — this would break for other covariance types.
gmm_parameters = (
    pd.DataFrame(
        model.means_,
        columns=[f"Mean_{col}" for col in training_data.columns],
        index=[f"Cluster{c}" for c in range(k)],
    )
    .join(
        pd.DataFrame(
            model.covariances_,
            columns=["Variance"],
            index=[f"Cluster{c}" for c in range(k)],
        )
    )
    .join(
        pd.DataFrame(
            model.weights_, columns=["Weight"], index=[f"Cluster{c}" for c in range(k)]
        )
    )
)
gmm_parameters
| Mean_PC1 | Mean_PC2 | Mean_PC3 | Mean_PC4 | Mean_PC5 | Mean_PC6 | Mean_PC7 | Variance | Weight | |
|---|---|---|---|---|---|---|---|---|---|
| Cluster0 | -3.451822 | 0.741523 | 0.577830 | 0.234283 | 0.281182 | 0.150651 | -0.142495 | 1.032032 | 0.112743 |
| Cluster1 | 1.898625 | 0.136341 | -0.325492 | 0.613008 | 0.537588 | -0.253704 | -0.145181 | 0.657455 | 0.199729 |
| Cluster2 | -0.779855 | -0.302831 | -0.445886 | 0.027444 | -0.328718 | 0.054580 | 0.073223 | 0.645718 | 0.469115 |
| Cluster3 | 1.720595 | 0.142985 | 0.957063 | -0.740445 | 0.069288 | 0.037007 | 0.049044 | 3.643744 | 0.218413 |
# persist the fitted GMM parameters alongside the other results
gmm_parameters.to_csv(ODIR.joinpath("GMM_parameters.csv"))
help(visualise.plot_gmm)
Help on function plot_gmm in module hysevt.utils.visualise: plot_gmm(X, Y_, means, covariances, cluster_colors, alpha=0, no_ticks=False, ax=None, xy=None, e=3)
# Pairwise (PCi, PCj) projections of the GMM means/covariances, one PDF page
# per component pair.
dim = len(training_data.columns)
with PdfPages(ODIR.joinpath("GMM_cluster_mean_variance.pdf")) as pdf:
    for i in range(dim):
        for j in range(i + 1, dim):
            visualise.plot_gmm(
                X_train,
                model.predict(X_train),
                model.means_[:, [i, j]],
                model.covariances_,
                cluster_colors=cluster_colors,
                xy=(i, j),
                alpha=0.2,
                # ellipse scale proportional to the spread of the plotted pair
                e=(X_train[:, [i, j]].max() - X_train[:, [i, j]].min()) / 7,
            )
            plt.xlabel(f"PC{i+1}")
            plt.ylabel(f"PC{j+1}")
            pdf.savefig()
            # close each figure after saving: with dim=7 this loop creates 21
            # figures and previously triggered matplotlib's "More than 20
            # figures have been opened" memory warning
            plt.close()
X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names X does not have valid feature names, but GaussianMixture was fitted with feature names More than 20 figures have been opened. 
Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
# distributions of the raw event metrics
event_metrics.drop(columns=["start", "end"]).hist(figsize=(10, 10))
plt.tight_layout()
# calculate zscores (per metric, NaNs ignored)
event_metrics_zscores = event_metrics.drop(columns=["start", "end"]).apply(
    partial(scipy.stats.zscore, nan_policy="omit"), axis=0
)
# remove some columns and add cluster labels
event_metrics_zscores = event_metrics_zscores.drop(
    columns=[
        "year",
        "pSSY",
        "pQtotal",
        "n_SSC_peaks",
        "n_Q_peaks",
        "last_event_Qtotal_elapsed_time_logratio",
        "time_since_last_event",
    ]
).join(
    labels
)  # add cluster labels
# per-cluster median z-score of every remaining metric
cluster_zscores_average = hysevt.clustering.evaluation.calc_cluster_zscore_averages(
    event_metrics_zscores, mode="median"
)
# distributions of the z-scored metrics (visual check of the standardisation)
event_metrics_zscores.hist(figsize=(10, 10))
plt.tight_layout()
event_metrics_zscores.to_csv(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_zscores.csv")
)
cluster_zscores_average.to_csv(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_zscore_averages.csv")
)
# plotting order of the event metrics (grouped thematically, see em_colors)
em_col_order = [
    "seasonal_timing",
    "duration",
    "SSY_falling_to_rising_ratio",
    "SQPR",
    "peak_phase_diff",
    "SHI",
    "AHI",
    "last_event_SSY_elapsed_time_logratio",
    "Q_max_previous_ratio",
    "SSY",
    "SSC_max",
    "SSC_mean_weighted",
    "SSC_mean",
    "Q_mean",
    "Q_max",
    "Qtotal",
]
# LaTeX display labels, same order as em_col_order
em_col_order_labels = [
    r"$DOY$",
    r"$\Delta t$",
    r"$SSY_{ratio}$",
    r"$SQPR$",
    r"$phi_{peak}$",
    r"$SHI$",
    r"$AHI$",
    r"$IEI$",
    r"$Q_{peak,ratio}$",
    r"$SSY$",
    r"$SSC_{max}$",
    r"$SSC_{mean,w}$",
    r"$SSC_{mean}$",
    r"$Q_{mean}$",
    r"$Q_{max}$",
    r"$Q_{total}$",
]
# one colour per metric, same order as em_col_order (groups related metrics)
em_colors = [
    "darkslategrey",
    "darkslategrey",
    "green",
    "green",
    "green",
    "green",
    "green",
    "darkgoldenrod",
    "darkgoldenrod",
    "brown",
    "brown",
    "brown",
    "brown",
    "blue",
    "blue",
    "blue",
]
# violin plot of z-score distributions per metric and cluster
ax = visualise.violinplot_cluster_zscore(
    event_metrics_zscores,
    k,
    col_order=em_col_order,
    col_order_labels=em_col_order_labels,
    col_colors=em_colors,
    cluster_colors=cluster_colors,
)
plt.savefig(
    ODIR.joinpath(f"event_metrics_zscore_violins.png"),
    dpi=600,
    bbox_inches="tight",
)
# per-cluster median z-score bars, one panel per cluster
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, len(em_col_order) / 3),
    sharey=True,
    sharex=True,
)
for c in range(k):
    visualise.plot_cluster_zscore_averages(
        cluster_zscores_average.loc[:, em_col_order],
        c,
        ax=ax[c],
        color=em_colors,
    )
    ax[c].set_title(f"Cluster {c}")
ax[0].set_yticks(ax[0].get_yticks(), em_col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_zscores_averages.png"),
    dpi=300,
    bbox_inches="tight",
)
# interquartile range (25th-75th percentile) of the z-scores per cluster
lower, upper = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    event_metrics_zscores, error_range=(0.25, 0.75)
)
# same bars, now with IQR error ranges (poster version)
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, len(em_col_order) / 4),
    sharey=True,
    sharex=True,
)
for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        cluster_zscores_average.loc[:, em_col_order],
        lower.loc[:, em_col_order],
        upper.loc[:, em_col_order],
        c,
        ax=ax[c],
        color=em_colors,
        error_color="lightgrey",
    )
    ax[c].set_title(f"Cluster {c}", fontsize=16)
ax[0].set_xlim(-1.2, 1.8)
ax[0].set_xticks((-1, 1))
ax[0].set_yticks(ax[0].get_yticks(), em_col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_zscores_averages_error_bars_EGUposter.png"
    ),
    dpi=600,
    bbox_inches="tight",
)
# bundle the main result figures into a single summary PDF
# NOTE(review): the recorded notebook output shows the final
# quickplot_clustering_results call raising
# KeyError: 'SSC_to_Q_peak_logratio' — the library function boxplots a column
# that this event-metrics table does not contain; the fix belongs in
# hysevt.utils.visualise (or the column must be added upstream).
with PdfPages(ODIR.joinpath(f"best_cluster_number_results_{input_data}.pdf")) as pdf:
    visualise.plot_cluster_result_with_zscore_averages(
        data, labels.values, cluster_zscores_average
    )
    plt.savefig(
        ODIR.joinpath(
            f"best_cluster_number_results_{input_data}_with_zscore_averages.png"
        ),
        dpi=300,
        bbox_inches="tight",
    )
    pdf.savefig()
    visualise.plot_cluster_zscores(event_metrics_zscores)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_zscores.png"),
        dpi=300,
        bbox_inches="tight",
    )
    pdf.savefig()
    fig_list = visualise.quickplot_clustering_results(event_metrics.join(labels))
    [pdf.savefig(fig) for fig in fig_list]
To output multiple subplots, the figure containing the passed axes is being cleared.
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Input In [42], in <cell line: 1>() 14 plt.savefig( 15 ODIR.joinpath(f"best_cluster_number_{input_data}_zscores.png"), 16 dpi=300, 17 bbox_inches="tight", 18 ) 19 pdf.savefig() ---> 20 fig_list = visualise.quickplot_clustering_results(event_metrics.join(labels)) 21 [pdf.savefig(fig) for fig in fig_list] File ~/Documents/NRC_P8_water_energy_and_sediment/projects/water_sediment_pulses/src/hysevt/hysevt/utils/visualise.py:320, in quickplot_clustering_results(clustering_results, with_year_column) 316 fig, ax = plt.subplots(ncols=2) 317 clustering_results.boxplot( 318 column="last_event_SSY_elapsed_time_logratio", by="cluster_id", grid=False, color="grey", ax=ax[0] 319 ) --> 320 clustering_results.boxplot( 321 column="SSC_to_Q_peak_logratio", by="cluster_id", grid=False, color="grey", ax=ax[1] 322 ) 323 plt.tight_layout() 324 fig_list.append(fig) File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_core.py:511, in boxplot_frame(self, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, backend, **kwargs) 494 @Substitution(backend=_backend_doc) 495 @Appender(_boxplot_doc) 496 def boxplot_frame( (...) 508 **kwargs, 509 ): 510 plot_backend = _get_plot_backend(backend) --> 511 return plot_backend.boxplot_frame( 512 self, 513 column=column, 514 by=by, 515 ax=ax, 516 fontsize=fontsize, 517 rot=rot, 518 grid=grid, 519 figsize=figsize, 520 layout=layout, 521 return_type=return_type, 522 **kwargs, 523 ) File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:425, in boxplot_frame(self, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds) 410 def boxplot_frame( 411 self, 412 column=None, (...) 
421 **kwds, 422 ): 423 import matplotlib.pyplot as plt --> 425 ax = boxplot( 426 self, 427 column=column, 428 by=by, 429 ax=ax, 430 fontsize=fontsize, 431 grid=grid, 432 rot=rot, 433 figsize=figsize, 434 layout=layout, 435 return_type=return_type, 436 **kwds, 437 ) 438 plt.draw_if_interactive() 439 return ax File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:372, in boxplot(data, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds) 367 columns = [column] 369 if by is not None: 370 # Prefer array return type for 2-D plots to match the subplot layout 371 # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580 --> 372 result = _grouped_plot_by_column( 373 plot_group, 374 data, 375 columns=columns, 376 by=by, 377 grid=grid, 378 figsize=figsize, 379 ax=ax, 380 layout=layout, 381 return_type=return_type, 382 ) 383 else: 384 if return_type is None: File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/plotting/_matplotlib/boxplot.py:249, in _grouped_plot_by_column(plotf, data, columns, by, numeric_only, grid, figsize, ax, layout, return_type, **kwargs) 247 for i, col in enumerate(columns): 248 ax = _axes[i] --> 249 gp_col = grouped[col] 250 keys, values = zip(*gp_col) 251 re_plotf = plotf(keys, values, ax, **kwargs) File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1338, in DataFrameGroupBy.__getitem__(self, key) 1329 if isinstance(key, tuple) and len(key) > 1: 1330 # if len == 1, then it becomes a SeriesGroupBy and this is actually 1331 # valid syntax, so don't raise warning 1332 warnings.warn( 1333 "Indexing with multiple keys (implicitly converted to a tuple " 1334 "of keys) will be deprecated, use a list instead.", 1335 FutureWarning, 1336 stacklevel=find_stack_level(), 1337 ) -> 1338 return super().__getitem__(key) File ~/anaconda3/envs/pulses/lib/python3.8/site-packages/pandas/core/base.py:250, in SelectionMixin.__getitem__(self, key) 
248 else: 249 if key not in self.obj: --> 250 raise KeyError(f"Column not found: {key}") 251 subset = self.obj[key] 252 ndim = subset.ndim KeyError: 'Column not found: SSC_to_Q_peak_logratio'
# plot just event shapes
# per-cluster event shapes: Q, SSC and hysteresis rows, one column per cluster
fig, ax = plt.subplots(
    ncols=k,
    nrows=3,
    figsize=(3 * k, 3 * 3),
    sharey=True,
    sharex=False,
)
for c in range(k):
    visualise.plot_single_cluster(data, labels.values, c, ax=ax[:, c], ylabels=c == 0)
plt.savefig(ODIR.joinpath("event_cluster_shape_and_hysteresis.png"), dpi=600)
# plot just event shapes (compact poster layout, manual axis labelling)
fig, ax = plt.subplots(
    ncols=k,
    nrows=3,
    figsize=(2 * k, 1.5 * 3),
    sharey=True,
    sharex=False,
)
for c in range(k):
    visualise.plot_single_cluster(data, labels.values, c, ax=ax[:, c], ylabels=False)
# padded label so it appears roughly centred under all panels
ax[1, 1].set_xlabel(" " * 50 + "Standardised event length")
ax[0, 0].set_ylabel("Q (standardised)", color="blue")
ax[1, 0].set_ylabel("SSC (standardised)", color="brown")
ax[1, 0].set_yticks((0, 1))
for a in ax[:2, :].ravel():
    # events are resampled to 50 steps; show ticks as 1..50
    a.set_xticks((0, 49))
    a.set_xticklabels((1, 50))
ax[1, 0].set_ylim(0, 1)
plt.tight_layout()
# plt.savefig(ODIR.joinpath("event_cluster_shape_EGUposter.png"), dpi=600)
# per-cluster hysteresis clouds (colour encodes time along the event)
# with the median loop overlaid in black
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, 3),
    sharey=True,
    sharex=False,
)
alpha = 0.1
for c in range(k):
    cluster_average = np.median(data[labels == c], axis=0)
    for e in range((labels == c).sum()):
        ax[c].scatter(
            x=data[labels == c, 0, :][e],
            y=data[labels == c, 1, :][e],
            c=range(50),
            cmap="copper",
            s=30,
            alpha=alpha,
            edgecolor="none",
        )
    ax[c].plot(cluster_average[0], cluster_average[1], color="black")
plt.savefig(ODIR.joinpath("event_cluster_hysteresis.png"), dpi=600)
# catchment-scale hydro-meteorological metrics per event
catchment_water_metrics = pd.read_csv(
    eventsDIR.joinpath("event_catchment_water_metrics.csv"), index_col=0
)
catchment_energy_metrics = pd.read_csv(
    eventsDIR.joinpath("event_catchment_energy_metrics.csv"), index_col=0
)
# z-scores of the water metrics, unused columns dropped
catchment_water_metrics_zscores = hysevt.clustering.evaluation.calc_metric_zscores(
    catchment_water_metrics,
    labels,
    drop_cols=[
        "start",
        "end",
        "pre_start",
        "NAPI7_SPARTACUS",
        "NAPI7_INCA",
    ],
)
catchment_water_metrics_zscores_average = (
    hysevt.clustering.evaluation.calc_cluster_zscore_averages(
        catchment_water_metrics_zscores
    )
)
(
    catchment_water_metrics_zscores_lower,
    catchment_water_metrics_zscores_upper,
) = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    catchment_water_metrics_zscores, error_range=(0.25, 0.75)
)
# z-scores of the energy metrics, redundant INCA/SPARTACUS variants dropped
catchment_energy_metrics_zscores = hysevt.clustering.evaluation.calc_metric_zscores(
    catchment_energy_metrics,
    labels,
    drop_cols=[
        "start",
        "end",
        "event_Tmean_INCA",
        "event_Tmax_INCA",
        "ATmean5_INCA",
        "ATmean7_INCA",
        "ATmean14_INCA",
        "ATmean5_SPARTACUS",
        "ATmean7_SPARTACUS",
        "ATmean14_SPARTACUS",
        "AFI5_INCA",
        "ATI5_INCA",
        "AFI7_INCA",
        "ATI7_INCA",
        "AFI7_SPARTACUS",
        "ATI7_SPARTACUS",
        "AFI14_INCA",
        "ATI14_INCA",
        "AFI30_INCA",
        "ATI30_INCA",
        "AFI14_SPARTACUS",
        "ATI14_SPARTACUS",
        "AFI30_SPARTACUS",
        "AFI5_SPARTACUS",
    ],
)
# NOTE(review): energy metrics use mode="mean" and error_range=(0.1, 0.9)
# while the water metrics above use the defaults (median, 0.25-0.75) —
# confirm this asymmetry is intentional.
catchment_energy_metrics_zscores_average = (
    hysevt.clustering.evaluation.calc_cluster_zscore_averages(
        catchment_energy_metrics_zscores, mode="mean"
    )
)
(
    catchment_energy_metrics_zscores_lower,
    catchment_energy_metrics_zscores_upper,
) = hysevt.clustering.evaluation.calc_cluster_zscore_error_range(
    catchment_energy_metrics_zscores, error_range=(0.1, 0.9)
)
# combine energy and water z-score summaries into single tables
catchment_metrics_zscores_average = catchment_energy_metrics_zscores_average.join(
    catchment_water_metrics_zscores_average
)
catchment_metrics_zscores_lower = catchment_energy_metrics_zscores_lower.join(
    catchment_water_metrics_zscores_lower
)
catchment_metrics_zscores_upper = catchment_energy_metrics_zscores_upper.join(
    catchment_water_metrics_zscores_upper
)
# plotting order of the catchment metrics (grouped thematically, see colors)
col_order = [
    "fSCA",
    "ACDA_SPARTACUS",
    "event_global_radiation",
    "event_Tmean_SPARTACUS",
    "event_Tmax_SPARTACUS",
    "antecedent_global_radiation",
    "ATI5_SPARTACUS",
    "ATI30_SPARTACUS",
    "SCA_change",
    "event_precip_sum_SPARTACUS",
    "event_precip_sum_INCA",
    "event_max_precip_intensity_SPARTACUS",
    "event_max_precip_intensity_INCA",
    "NAPI5_SPARTACUS",
    "NAPI14_SPARTACUS",
]
# display labels, same order as col_order
col_order_labels = [
    "fSCA",
    "ACDA",
    "$GR_{event}$",
    "$T_{mean}$",
    "$T_{max}$",
    "AGR",
    "ATI5",
    "ATI30",
    "SCA_change",
    "$P_{total}$ (daily)",
    "$P_{total}$ (hourly)",
    "$I_{max}$ (daily)",
    "$I_{max}$ (hourly)",
    "NAPI5",
    "NAPI14",
]
# one colour per metric, same order as col_order (groups related metrics)
colors = [
    "grey",
    "grey",
    "hotpink",
    "red",
    "maroon",
    "salmon",
    "darksalmon",
    "darksalmon",
    "aqua",
    "blue",
    "blue",
    "darkblue",
    "darkblue",
    "slategrey",
    "slategrey",
]
# Violin plots of the catchment-metric z-scores per cluster, combining the
# energy and water tables into one frame (the duplicate cluster_id column is
# dropped from the water side before joining).
ax = visualise.violinplot_cluster_zscore(
    event_metrics_zscores=catchment_energy_metrics_zscores.join(
        catchment_water_metrics_zscores.drop(columns=["cluster_id"])
    ),
    k=k,
    col_order=col_order,
    col_colors=colors,
    col_order_labels=col_order_labels,
    cluster_colors=cluster_colors,
)
plt.savefig(
    # fix: was an f-string with no placeholders (lint F541)
    ODIR.joinpath("catchment_metrics_zscore_violins.png"),
    dpi=600,
    bbox_inches="tight",
)
# One panel per cluster showing the cluster-average catchment-metric z-scores.
panel_height = catchment_metrics_zscores_average.shape[1] / 3
fig, ax = plt.subplots(
    ncols=k, figsize=(3 * k, panel_height), sharex=True, sharey=True
)
zscore_means = catchment_metrics_zscores_average.loc[:, col_order]
for cluster_no, panel in enumerate(ax):
    visualise.plot_cluster_zscore_averages(
        zscore_means, cluster_no, ax=panel, color=colors
    )
    panel.set_title(f"Cluster {cluster_no}")
# Pretty metric labels only on the leftmost panel (y-axis is shared).
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores.png"
    ),
    dpi=300,
)
# Cluster-average z-scores with percentile error bars; per-panel x-limits
# (set_xlim_to_global=False) — "version 1" of the figure.
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 3),
    sharex=True,
    sharey=True,
)
avg_z = catchment_metrics_zscores_average.loc[:, col_order]
low_z = catchment_metrics_zscores_lower.loc[:, col_order]
high_z = catchment_metrics_zscores_upper.loc[:, col_order]
for cluster_no, panel in enumerate(ax):
    visualise.plot_cluster_zscore_averages_with_error_range(
        avg_z,
        low_z,
        high_z,
        cluster_no,
        ax=panel,
        color=colors,
        set_xlim_to_global=False,
    )
    panel.set_title(f"Cluster {cluster_no}")
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_version1.png"
    ),
    dpi=300,
)
# Same data as version 1 but with globally shared x-limits
# (set_xlim_to_global=True) — "version 2" of the figure.
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 3),
    sharex=True,
    sharey=True,
)
mean_z = catchment_metrics_zscores_average.loc[:, col_order]
lo_z = catchment_metrics_zscores_lower.loc[:, col_order]
hi_z = catchment_metrics_zscores_upper.loc[:, col_order]
for cluster_no, panel in enumerate(ax):
    visualise.plot_cluster_zscore_averages_with_error_range(
        mean_z,
        lo_z,
        hi_z,
        cluster_no,
        ax=panel,
        color=colors,
        set_xlim_to_global=True,
    )
    panel.set_title(f"Cluster {cluster_no}")
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_version2.png"
    ),
    dpi=300,
)
# Poster variant (EGU): grey error band, larger titles, fixed narrow x-axis.
fig, ax = plt.subplots(
    ncols=k,
    figsize=(3 * k, catchment_metrics_zscores_average.shape[1] / 4),
    sharey=True,
    sharex=True,
)
for c in range(k):
    visualise.plot_cluster_zscore_averages_with_error_range(
        catchment_metrics_zscores_average.loc[:, col_order],
        catchment_metrics_zscores_lower.loc[:, col_order],
        catchment_metrics_zscores_upper.loc[:, col_order],
        c,
        ax=ax[c],
        color=colors,
        set_xlim_to_global=False,
        error_color="lightgrey",
    )
    ax[c].set_title(f"Cluster {c}", fontsize=16)
ax[0].set_yticks(ax[0].get_yticks(), col_order_labels)
ax[0].set_xlim(-1.2, 1.2)
# NOTE(review): the -1.5 tick lies outside the (-1.2, 1.2) x-limits set just
# above and will not be drawn — confirm whether (-1, 1) was intended.
ax[0].set_xticks((-1.5, 1))
plt.tight_layout()
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_results_with_catchment_zscores_with_IQRerror_bars_EGUposter.png"
    ),
    dpi=300,
)
# Notebook echo: the representative event IDs, one per cluster
# (cluster_representative is defined earlier in the file). Both lines below
# are no-op expression statements left over from the notebook export.
cluster_representative
['VEN-AUT-2015-037', 'VEN-AUT-2006-075', 'VEN-AUT-2009-030', 'VEN-AUT-2007-085']
# Bar chart of the raw (non-z-scored) catchment metrics for each cluster's
# representative event, in col_order with the matching metric colours.
catchment_energy_metrics.drop(columns=["start", "end"]).join(
    catchment_water_metrics.drop(columns=["start", "end", "pre_start"])
).loc[cluster_representative, col_order].plot.bar(
    color=colors, figsize=(20, 5), legend=False
)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# <matplotlib.legend.Legend at 0x7fe1301feb20>  (notebook output echo)
# Notebook echo of the representative event IDs (no-op expression statements).
cluster_representative
['VEN-AUT-2015-037', 'VEN-AUT-2006-075', 'VEN-AUT-2009-030', 'VEN-AUT-2007-085']
fig, ax = plt.subplots(
    figsize=(6, 10),
)
# Horizontal bar chart of the z-scored catchment metrics for the same
# representative events; y-ticks are relabelled to the cluster numbers.
catchment_energy_metrics_zscores.join(
    catchment_water_metrics_zscores.drop(columns=["cluster_id"])
).loc[cluster_representative, col_order].plot.barh(
    ax=ax, color=colors, legend=False, label="cluster_id"
)
plt.yticks(
    range(len(cluster_representative)),
    [f"Cluster{i}" for i in range(len(cluster_representative))],
)
# Reverse the legend entries so they read top-to-bottom like the bars.
handles, llabels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], llabels[::-1], bbox_to_anchor=(1, 1), loc="upper left")
# <matplotlib.legend.Legend at 0x7fe11bd5e820>  (notebook output echo)
# One wide table for the KDE grid below: raw catchment metrics (in col_order)
# joined with a selection of event metrics plus the cluster label as the
# final column.
all_metrics = (
    catchment_energy_metrics.drop(columns=["start", "end"])
    .join(catchment_water_metrics.drop(columns=["start", "end", "pre_start"]))[
        col_order
    ]
    .join(
        event_metrics_labels[
            [
                "duration",
                "seasonal_timing",
                "SSY",
                "SSC_max",
                "SSC_mean_weighted",
                "Qtotal",
                "Q_max",
                "Q_mean",
                "SSY_falling_to_rising_ratio",
                "Q_max_previous_ratio",
                "last_event_SSC_max_elapsed_time_logratio",
                "peak_phase_diff",
                "SQPR",
                "SHI",
                "AHI",
                "cluster_id",
            ]
        ]
    )
)
# KDE of each metric's distribution, one subplot per metric, one curve per
# cluster (the last column of all_metrics is cluster_id and is skipped).
n_metrics = len(all_metrics.columns.to_list()[:-1])
# fix: int(n/5) truncates, so a metric count not divisible by 5 would
# under-allocate the axes grid and crash with IndexError in the loop below;
# -(-n // 5) is ceiling division without an extra import.
fig, ax = plt.subplots(ncols=5, nrows=-(-n_metrics // 5), figsize=(15, 15))
ax = ax.ravel()
# Metrics spanning orders of magnitude get a logarithmic x-axis.
log_scaled_metrics = [
    "SSY",
    "SSC_max",
    "SSC_mean_weighted",
    "Qtotal",
    "Q_max",
    "Q_mean",
]
for i, m in enumerate(all_metrics.columns.to_list()[:-1]):
    # Pivot to one column per cluster; events without a label land in a NaN
    # column which is dropped before plotting.
    all_metrics.pivot(columns="cluster_id")[m].drop(columns=[np.nan]).plot(
        kind="kde",
        stacked=True,
        color=cluster_colors,
        ax=ax[i],
        legend=False,
        logx=m in log_scaled_metrics,
    )
    # Clip the KDE to the observed data range.
    ax[i].set_xlim(all_metrics[m].min(), all_metrics[m].max())
    ax[i].set_title(m)
plt.tight_layout()
ax[4].legend(bbox_to_anchor=(1, 1), loc="upper left", title="Cluster")
plt.savefig(
    ODIR.joinpath(f"cluster_{input_data}_results_all_metrics_kde.png"),
    dpi=300,
    bbox_inches="tight",
)
# Making the clustering explainable with SHAP.
# https://towardsdatascience.com/how-to-make-clustering-explainable-1582390476cc
# Model-agnostic SHAP explainer wrapping the fitted model's predict function,
# with the training data as background distribution.
explainer = shap.Explainer(model.predict, training_data)
shap_values = explainer(training_data)
# example event with highest cluster prob from each cluster
for c in range(k):
    # get event with highest probability; random.choice breaks ties when
    # several events share the maximum probability for this cluster
    event_id = random.choice(
        list(
            cluster_prob[
                cluster_prob[f"prob_cluster_{c}"]
                == cluster_prob[f"prob_cluster_{c}"].max()
            ].index
        )
    )
    # positional index of the chosen event within the training data
    i = np.where(training_data.index == event_id)[0][0]
    # plot
    plt.figure()
    shap.plots.waterfall(shap_values[i], show=False)
    plt.savefig(
        ODIR.joinpath(
            f"best_cluster_number_{input_data}_results_SHAP_cluster{c}_example_{event_id}.png"
        ),
        dpi=300,
        bbox_inches="tight",
    )
# all events
# Global SHAP summary (beeswarm) over the full training set.
shap.plots.beeswarm(shap_values, show=False)
plt.savefig(
    ODIR.joinpath(f"best_cluster_number_{input_data}_results_SHAP.png"),
    dpi=300,
    bbox_inches="tight",
)
# Per-cluster SHAP beeswarm: re-run the explainer on each cluster's events.
# NOTE: this rebinds shap_values, clobbering the all-events values above.
for c, train in training_data.join(labels).groupby("cluster_id"):
    shap_values = explainer(train.drop(columns=["cluster_id"]))
    plt.figure()
    shap.plots.beeswarm(shap_values, show=False)
    plt.savefig(
        ODIR.joinpath(f"best_cluster_number_{input_data}_results_SHAP_cluster{c}.png"),
        dpi=300,
        bbox_inches="tight",
    )
# Combined violin figure layout: one column per cluster, event metrics in the
# top row and catchment metrics in the bottom row; row heights proportional to
# the number of metrics in each group.
size = (len(em_col_order) + len(col_order)) / 2
gs = mpl.gridspec.GridSpec(
    2, k, width_ratios=[1] * k, height_ratios=[len(em_col_order), len(col_order)]
)
fig = plt.figure(figsize=(size, 3 * size / k))
# Top-row axes (event metrics) and bottom-row axes (catchment metrics).
em_ax = np.array([fig.add_subplot(gs[c]) for c in range(k)])
cm_ax = np.array([fig.add_subplot(gs[c]) for c in range(k, k * 2)])
# event metrics
for c, sub in event_metrics_zscores.groupby("cluster_id"):
    for j, m in enumerate(em_col_order):
        # one horizontal violin per metric, stacked bottom-to-top at y = j + 1
        violins = em_ax[int(c)].violinplot(
            sub.loc[:, m].dropna(),
            positions=[j + 1],
            vert=False,
            showmedians=False,
            showmeans=True,
            widths=0.9,
        )
        # colour the violin body and its bar/min/max/mean parts alike;
        # the mean line is drawn thicker than the rest
        violins["bodies"][0].set_edgecolor(em_colors[j])
        violins["bodies"][0].set_facecolor(em_colors[j])
        violins["bodies"][0].set_alpha(0.7)
        for partname in ("cbars", "cmins", "cmaxes", "cmeans"):
            vp = violins[partname]
            vp.set_edgecolor(em_colors[j])
            vp.set_linewidth(2 if partname == "cmeans" else 1)
    # vertical zero-reference line spanning the panel
    em_ax[int(c)].vlines(
        0, 0, len(em_col_order) + 1, color="grey", alpha=0.5, linewidth=1
    )
    em_ax[int(c)].set_xlabel("Z-Score")
    em_ax[int(c)].set_title(
        f"Cluster{int(c)}", color=cluster_colors[int(c)], fontsize=16
    )
    # blank y-tick labels on every panel; the leftmost panel is relabelled below
    em_ax[int(c)].set_yticks(range(1, len(em_col_order) + 1), [""] * len(em_col_order))
    em_ax[int(c)].set_ylim(0, len(em_col_order) + 1)
    em_ax[int(c)].set_xlim(-7, 7)
em_ax[0].set_yticks(range(1, len(em_col_order) + 1), em_col_order_labels)
# catchment metrics: same violin layout as the event-metric row above, drawn
# on the bottom-row axes from the joined energy + water z-score table.
for c, sub in catchment_energy_metrics_zscores.join(
    catchment_water_metrics_zscores.drop(columns=["cluster_id"])
).groupby("cluster_id"):
    for j, m in enumerate(col_order):
        # one horizontal violin per metric at y = j + 1
        violins = cm_ax[int(c)].violinplot(
            sub.loc[:, m].dropna(),
            positions=[j + 1],
            vert=False,
            showmedians=False,
            showmeans=True,
            widths=0.9,
        )
        # colour the body and the bar/min/max/mean parts alike
        violins["bodies"][0].set_edgecolor(colors[j])
        violins["bodies"][0].set_facecolor(colors[j])
        violins["bodies"][0].set_alpha(0.7)
        for partname in ("cbars", "cmins", "cmaxes", "cmeans"):
            vp = violins[partname]
            vp.set_edgecolor(colors[j])
            vp.set_linewidth(2 if partname == "cmeans" else 1)
    # fix: zero-reference line length was hard-coded as 16; derive it as
    # len(col_order) + 1 (currently equal) to stay consistent with the
    # event-metric loop above and robust to changes in col_order
    cm_ax[int(c)].vlines(
        0, 0, len(col_order) + 1, color="grey", alpha=0.5, linewidth=1
    )
    cm_ax[int(c)].set_xlabel("Z-Score")
    cm_ax[int(c)].set_ylim(0, len(col_order) + 1)
    # blank y-tick labels; the leftmost panel is relabelled below
    cm_ax[int(c)].set_yticks(range(1, len(col_order) + 1), [""] * len(col_order))
    cm_ax[int(c)].set_xlim(-7, 7)
cm_ax[0].set_yticks(range(1, len(col_order) + 1), col_order_labels)
plt.tight_layout()
plt.savefig(
    # fix: was an f-string with no placeholders (lint F541)
    ODIR.joinpath("event_catchment_metrics_violinplots.png"),
    dpi=300,
    bbox_inches="tight",
)
event_type_yield = (
event_metrics_labels.groupby(["year", "cluster_id"], dropna=True)
.SSY.sum()
.unstack()
)
event_type_yield.columns = [f"type_{int(col)}" for col in event_type_yield.columns]
annual_SSY_event_type = annual_SSY.join(
event_type_yield.sum(axis=1).rename("total_event_yield")
)
annual_SSY_event_type["non_event_yield"] = (
annual_SSY_event_type.sediment_yield - annual_SSY_event_type.total_event_yield
)
event_type_yield = event_type_yield.join(annual_SSY_event_type["non_event_yield"])
event_type_yield.to_csv(ODIR.joinpath("event_type_annual_sediment_yield.csv"))
# set new column names
event_type_yield.columns = [f"Cluster {c}" for c in range(k)] + [
"non-events",
]
event_type_yield.divide(annual_SSY_event_type.sediment_yield, axis="rows").mean(axis=0)
# Notebook output echo of the mean fractional yield per type:
# Cluster 0 0.242324  Cluster 1 0.062559  Cluster 2 0.359808  Cluster 3 0.131030  non-events 0.236588  dtype: float64
# Final figure: stacked annual yield per event type (left) and the mean yield
# split as a pie chart (right).
fig, ax = plt.subplots(figsize=(15, 5), ncols=2)
# Cluster colours plus one grey row for the non-event share (built once,
# reused by both panels).
type_palette = np.vstack((cluster_colors, np.array(plt.cm.gray(0.5)).reshape(1, 4)))
event_type_yield.plot.bar(stacked=True, ax=ax[0], color=type_palette)
ax[0].set_ylabel("Annual suspended sediment yield [t]")
mean_yield = event_type_yield.mean(axis=0)
mean_yield.plot.pie(
    ax=ax[1],
    autopct="%1.1f%%",
    colors=type_palette,
    legend=False,
)
ax[0].legend(loc="upper left", bbox_to_anchor=(1, 1))
ax[1].set_ylabel("")
plt.savefig(
    ODIR.joinpath(
        f"best_cluster_number_{input_data}_annual_sediment_yield_event_types.png"
    ),
    dpi=600,
    bbox_inches="tight",
)